import plotly.graph_objects as go
import pandas as pd
import plotly.express as px
import numpy as np
import scipy as sy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.units as munits
import matplotlib as mpl
from scipy.special import ndtri
import warnings
import scipy.stats as st
import statsmodels.api as sm
import matplotlib
df = pd.read_csv('Time series_modified.csv', header=2, sep=',') #read timeseries data from Inagro agrodig
df
| Hour | Day | Vedows_grass combo | Actual_grass | PM | Total | Biogas | |
|---|---|---|---|---|---|---|---|
| 0 | 0 | 26/03/2020 | 57.0 | 21.09 | NaN | 57 | 2.89 |
| 1 | 1 | 26/03/2020 | 39.0 | 14.43 | NaN | 39 | 2.89 |
| 2 | 2 | 26/03/2020 | 58.0 | 21.46 | NaN | 58 | 2.90 |
| 3 | 3 | 26/03/2020 | 67.0 | 24.79 | NaN | 67 | 2.93 |
| 4 | 4 | 26/03/2020 | 39.0 | 14.43 | NaN | 39 | 2.96 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 979 | 19 | 5/05/2020 | NaN | 0.00 | 125.0 | 125 | 7.55 |
| 980 | 20 | 5/05/2020 | NaN | 0.00 | NaN | 0 | 7.55 |
| 981 | 21 | 5/05/2020 | NaN | 0.00 | 125.0 | 125 | 3.86 |
| 982 | 22 | 5/05/2020 | NaN | 0.00 | NaN | 0 | 2.68 |
| 983 | 23 | 5/05/2020 | NaN | 0.00 | 125.0 | 125 | 7.15 |
984 rows × 7 columns
#creating a dataframe with timestamp
t_index = pd.DatetimeIndex(pd.date_range(start='2020-03-26', end='2020-05-05 23:00:00', freq="1h"))
time = pd.DataFrame(t_index, columns=['hour'])
time
| hour | |
|---|---|
| 0 | 2020-03-26 00:00:00 |
| 1 | 2020-03-26 01:00:00 |
| 2 | 2020-03-26 02:00:00 |
| 3 | 2020-03-26 03:00:00 |
| 4 | 2020-03-26 04:00:00 |
| ... | ... |
| 979 | 2020-05-05 19:00:00 |
| 980 | 2020-05-05 20:00:00 |
| 981 | 2020-05-05 21:00:00 |
| 982 | 2020-05-05 22:00:00 |
| 983 | 2020-05-05 23:00:00 |
984 rows × 1 columns
extracted_col = df[df.columns[2:9]]
extracted_col
| Vedows_grass combo | Actual_grass | PM | Total | Biogas | |
|---|---|---|---|---|---|
| 0 | 57.0 | 21.09 | NaN | 57 | 2.89 |
| 1 | 39.0 | 14.43 | NaN | 39 | 2.89 |
| 2 | 58.0 | 21.46 | NaN | 58 | 2.90 |
| 3 | 67.0 | 24.79 | NaN | 67 | 2.93 |
| 4 | 39.0 | 14.43 | NaN | 39 | 2.96 |
| ... | ... | ... | ... | ... | ... |
| 979 | NaN | 0.00 | 125.0 | 125 | 7.55 |
| 980 | NaN | 0.00 | NaN | 0 | 7.55 |
| 981 | NaN | 0.00 | 125.0 | 125 | 3.86 |
| 982 | NaN | 0.00 | NaN | 0 | 2.68 |
| 983 | NaN | 0.00 | 125.0 | 125 | 7.15 |
984 rows × 5 columns
df1 = time.join(extracted_col)
df1.columns
Index(['hour', 'Vedows_grass combo', 'Actual_grass', 'PM', 'Total', 'Biogas'], dtype='object')
df1['hour'] = pd.to_datetime(time['hour'])
df1
| hour | Vedows_grass combo | Actual_grass | PM | Total | Biogas | |
|---|---|---|---|---|---|---|
| 0 | 2020-03-26 00:00:00 | 57.0 | 21.09 | NaN | 57 | 2.89 |
| 1 | 2020-03-26 01:00:00 | 39.0 | 14.43 | NaN | 39 | 2.89 |
| 2 | 2020-03-26 02:00:00 | 58.0 | 21.46 | NaN | 58 | 2.90 |
| 3 | 2020-03-26 03:00:00 | 67.0 | 24.79 | NaN | 67 | 2.93 |
| 4 | 2020-03-26 04:00:00 | 39.0 | 14.43 | NaN | 39 | 2.96 |
| ... | ... | ... | ... | ... | ... | ... |
| 979 | 2020-05-05 19:00:00 | NaN | 0.00 | 125.0 | 125 | 7.55 |
| 980 | 2020-05-05 20:00:00 | NaN | 0.00 | NaN | 0 | 7.55 |
| 981 | 2020-05-05 21:00:00 | NaN | 0.00 | 125.0 | 125 | 3.86 |
| 982 | 2020-05-05 22:00:00 | NaN | 0.00 | NaN | 0 | 2.68 |
| 983 | 2020-05-05 23:00:00 | NaN | 0.00 | 125.0 | 125 | 7.15 |
984 rows × 6 columns
daily = df1.resample('D', on='hour').sum()
daily.describe()
#daily.to_csv('TS_inagro_disagg.csv', sep=',')
| Vedows_grass combo | Actual_grass | PM | Total | Biogas | |
|---|---|---|---|---|---|
| count | 41.000000 | 41.000000 | 41.000000 | 41.000000 | 41.000000 |
| mean | 895.365854 | 331.285366 | 1037.560976 | 1932.926829 | 105.335610 |
| std | 690.628509 | 255.532548 | 759.257139 | 1108.583655 | 31.822892 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 58.940000 |
| 25% | 0.000000 | 0.000000 | 575.000000 | 1365.000000 | 72.050000 |
| 50% | 1125.000000 | 416.250000 | 1315.000000 | 2376.000000 | 115.330000 |
| 75% | 1464.000000 | 541.680000 | 1380.000000 | 2628.000000 | 133.020000 |
| max | 1925.000000 | 712.250000 | 3920.000000 | 5151.000000 | 156.860000 |
df2 = daily.reset_index()
df2.rename({'hour': 'Day'}, axis=1, inplace=True)
df2.describe()
| Vedows_grass combo | Actual_grass | PM | Total | Biogas | |
|---|---|---|---|---|---|
| count | 41.000000 | 41.000000 | 41.000000 | 41.000000 | 41.000000 |
| mean | 895.365854 | 331.285366 | 1037.560976 | 1932.926829 | 105.335610 |
| std | 690.628509 | 255.532548 | 759.257139 | 1108.583655 | 31.822892 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 58.940000 |
| 25% | 0.000000 | 0.000000 | 575.000000 | 1365.000000 | 72.050000 |
| 50% | 1125.000000 | 416.250000 | 1315.000000 | 2376.000000 | 115.330000 |
| 75% | 1464.000000 | 541.680000 | 1380.000000 | 2628.000000 | 133.020000 |
| max | 1925.000000 | 712.250000 | 3920.000000 | 5151.000000 | 156.860000 |
df_cumulative = daily[daily.columns[0:9]].cumsum()
#df_cumulative.to_csv('TS_inagro.csv', sep=',')
df_cumulative.columns
Index(['Vedows_grass combo', 'Actual_grass', 'PM', 'Total', 'Biogas'], dtype='object')
df_cumulative.describe()
| Vedows_grass combo | Actual_grass | PM | Total | Biogas | |
|---|---|---|---|---|---|
| count | 41.000000 | 41.000000 | 41.000000 | 41.000000 | 41.000000 |
| mean | 13404.853659 | 4959.795854 | 20138.170732 | 33543.024390 | 1884.378293 |
| std | 12647.412895 | 4679.542771 | 13798.429927 | 26091.855225 | 1301.674017 |
| min | 1427.000000 | 527.990000 | 0.000000 | 1427.000000 | 70.240000 |
| 25% | 1486.000000 | 549.820000 | 7860.000000 | 9346.000000 | 772.290000 |
| 50% | 9220.000000 | 3411.400000 | 20430.000000 | 29650.000000 | 1642.310000 |
| 75% | 24338.000000 | 9005.060000 | 33990.000000 | 58328.000000 | 2953.240000 |
| max | 36710.000000 | 13582.700000 | 42540.000000 | 79250.000000 | 4318.760000 |
df2.columns
df2.describe()
| Vedows_grass combo | Actual_grass | PM | Total | Biogas | |
|---|---|---|---|---|---|
| count | 41.000000 | 41.000000 | 41.000000 | 41.000000 | 41.000000 |
| mean | 895.365854 | 331.285366 | 1037.560976 | 1932.926829 | 105.335610 |
| std | 690.628509 | 255.532548 | 759.257139 | 1108.583655 | 31.822892 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 58.940000 |
| 25% | 0.000000 | 0.000000 | 575.000000 | 1365.000000 | 72.050000 |
| 50% | 1125.000000 | 416.250000 | 1315.000000 | 2376.000000 | 115.330000 |
| 75% | 1464.000000 | 541.680000 | 1380.000000 | 2628.000000 | 133.020000 |
| max | 1925.000000 | 712.250000 | 3920.000000 | 5151.000000 | 156.860000 |
df2
| Day | Vedows_grass combo | Actual_grass | PM | Total | Biogas | |
|---|---|---|---|---|---|---|
| 0 | 2020-03-26 | 1427.0 | 527.99 | 0.0 | 1427 | 70.24 |
| 1 | 2020-03-27 | 59.0 | 21.83 | 0.0 | 59 | 71.27 |
| 2 | 2020-03-28 | 0.0 | 0.00 | 0.0 | 0 | 72.05 |
| 3 | 2020-03-29 | 0.0 | 0.00 | 960.0 | 960 | 73.79 |
| 4 | 2020-03-30 | 0.0 | 0.00 | 805.0 | 805 | 73.41 |
| 5 | 2020-03-31 | 0.0 | 0.00 | 0.0 | 0 | 73.07 |
| 6 | 2020-04-01 | 0.0 | 0.00 | 575.0 | 575 | 72.18 |
| 7 | 2020-04-02 | 0.0 | 0.00 | 1380.0 | 1380 | 70.03 |
| 8 | 2020-04-03 | 0.0 | 0.00 | 1380.0 | 1380 | 68.23 |
| 9 | 2020-04-04 | 0.0 | 0.00 | 1380.0 | 1380 | 65.45 |
| 10 | 2020-04-05 | 0.0 | 0.00 | 1380.0 | 1380 | 62.57 |
| 11 | 2020-04-06 | 0.0 | 0.00 | 1380.0 | 1380 | 61.62 |
| 12 | 2020-04-07 | 0.0 | 0.00 | 720.0 | 720 | 60.76 |
| 13 | 2020-04-08 | 0.0 | 0.00 | 0.0 | 0 | 58.94 |
| 14 | 2020-04-09 | 0.0 | 0.00 | 640.0 | 640 | 59.72 |
| 15 | 2020-04-10 | 951.0 | 351.87 | 1610.0 | 2561 | 83.45 |
| 16 | 2020-04-11 | 1231.0 | 455.47 | 3920.0 | 5151 | 101.71 |
| 17 | 2020-04-12 | 1026.0 | 379.62 | 1380.0 | 2406 | 108.19 |
| 18 | 2020-04-13 | 1302.0 | 481.74 | 1380.0 | 2682 | 104.55 |
| 19 | 2020-04-14 | 1698.0 | 628.26 | 690.0 | 2388 | 115.33 |
| 20 | 2020-04-15 | 1526.0 | 564.62 | 850.0 | 2376 | 115.75 |
| 21 | 2020-04-16 | 1435.0 | 530.95 | 1380.0 | 2815 | 119.62 |
| 22 | 2020-04-17 | 1715.0 | 634.55 | 1380.0 | 3095 | 127.92 |
| 23 | 2020-04-18 | 1578.0 | 583.86 | 1610.0 | 3188 | 133.00 |
| 24 | 2020-04-19 | 1103.0 | 408.11 | 1380.0 | 2483 | 127.37 |
| 25 | 2020-04-20 | 1925.0 | 712.25 | 1610.0 | 3535 | 140.70 |
| 26 | 2020-04-21 | 1289.0 | 476.93 | 1610.0 | 2899 | 143.93 |
| 27 | 2020-04-22 | 1239.0 | 458.43 | 1275.0 | 2514 | 137.03 |
| 28 | 2020-04-23 | 1586.0 | 586.82 | 900.0 | 2486 | 133.23 |
| 29 | 2020-04-24 | 1775.0 | 656.75 | 1315.0 | 3090 | 135.36 |
| 30 | 2020-04-25 | 1473.0 | 545.01 | 1100.0 | 2573 | 112.77 |
| 31 | 2020-04-26 | 1785.0 | 660.45 | 435.0 | 2220 | 156.86 |
| 32 | 2020-04-27 | 1532.0 | 566.84 | 0.0 | 1532 | 133.02 |
| 33 | 2020-04-28 | 1464.0 | 541.68 | 0.0 | 1464 | 129.20 |
| 34 | 2020-04-29 | 1365.0 | 505.05 | 0.0 | 1365 | 132.76 |
| 35 | 2020-04-30 | 1125.0 | 416.25 | 125.0 | 1250 | 127.42 |
| 36 | 2020-05-01 | 1161.0 | 429.57 | 1500.0 | 2661 | 129.51 |
| 37 | 2020-05-02 | 964.0 | 356.68 | 1625.0 | 2589 | 136.15 |
| 38 | 2020-05-03 | 878.0 | 324.86 | 1750.0 | 2628 | 134.91 |
| 39 | 2020-05-04 | 1169.0 | 432.53 | 1500.0 | 2669 | 136.85 |
| 40 | 2020-05-05 | 929.0 | 343.73 | 1615.0 | 2544 | 148.84 |
fig = px.line(df2, x=df2.Day, y=df2.Biogas, title="Daily biogas production")
fig.show()
fig1 = px.scatter(df2, x='Day', y='Total', color='Biogas', size='Actual_grass')
fig1.show()
fig = px.scatter(df2, x='Day', y='Biogas', color='Total', size='Actual_grass')
fig.show()
fig2 = px.line(df2, x='Day', y='Total')
fig2.show()
data = df2.drop(['Day', 'Vedows_grass combo', 'Actual_grass', 'PM', 'Total'], axis=1)
data
| Biogas | |
|---|---|
| 0 | 70.24 |
| 1 | 71.27 |
| 2 | 72.05 |
| 3 | 73.79 |
| 4 | 73.41 |
| 5 | 73.07 |
| 6 | 72.18 |
| 7 | 70.03 |
| 8 | 68.23 |
| 9 | 65.45 |
| 10 | 62.57 |
| 11 | 61.62 |
| 12 | 60.76 |
| 13 | 58.94 |
| 14 | 59.72 |
| 15 | 83.45 |
| 16 | 101.71 |
| 17 | 108.19 |
| 18 | 104.55 |
| 19 | 115.33 |
| 20 | 115.75 |
| 21 | 119.62 |
| 22 | 127.92 |
| 23 | 133.00 |
| 24 | 127.37 |
| 25 | 140.70 |
| 26 | 143.93 |
| 27 | 137.03 |
| 28 | 133.23 |
| 29 | 135.36 |
| 30 | 112.77 |
| 31 | 156.86 |
| 32 | 133.02 |
| 33 | 129.20 |
| 34 | 132.76 |
| 35 | 127.42 |
| 36 | 129.51 |
| 37 | 136.15 |
| 38 | 134.91 |
| 39 | 136.85 |
| 40 | 148.84 |
# Create models from data
def best_fit_distribution(data, bins=200, ax=None):
"""Model data by finding best fit distribution to data"""
# Get histogram of original data
y, x = np.histogram(data, bins=bins, density=True)
x = (x + np.roll(x, -1))[:-1] / 2.0
# Distributions to check
DISTRIBUTIONS = [
st.dgamma,st.dweibull,st.expon,st.exponnorm,st.gamma,st.lognorm,st.norm,st.t,st.triang,
st.uniform
]
# Best holders
best_distribution = st.norm
best_params = (0.0, 1.0)
best_sse = np.inf
# Estimate distribution parameters from data
for distribution in DISTRIBUTIONS:
# Try to fit the distribution
try:
# Ignore warnings from data that can't be fit
with warnings.catch_warnings():
warnings.filterwarnings('ignore')
# fit dist to data
params = distribution.fit(data)
# Separate parts of parameters
arg = params[:-2]
loc = params[-2]
scale = params[-1]
# Calculate fitted PDF and error with fit in distribution
pdf = distribution.pdf(x, loc=loc, scale=scale, *arg)
sse = np.sum(np.power(y - pdf, 2.0))
# if axis pass in add to plot
try:
if ax:
pd.Series(pdf, x).plot(ax=ax)
end
except Exception:
pass
# identify if this distribution is better
if best_sse > sse > 0:
best_distribution = distribution
best_params = params
best_sse = sse
except Exception:
pass
return (best_distribution.name, best_params)
def make_pdf(dist, params, size=10000):
"""Generate distributions's Probability Distribution Function """
# Separate parts of parameters
arg = params[:-2]
loc = params[-2]
scale = params[-1]
# Get sane start and end points of distribution
start = dist.ppf(0.01, *arg, loc=loc, scale=scale) if arg else dist.ppf(0.01, loc=loc, scale=scale)
end = dist.ppf(0.99, *arg, loc=loc, scale=scale) if arg else dist.ppf(0.99, loc=loc, scale=scale)
# Build PDF and turn into pandas Series
x = np.linspace(start, end, size)
y = dist.pdf(x, loc=loc, scale=scale, *arg)
pdf = pd.Series(y, x)
return pdf
# Plot for comparison
plt.figure(figsize=(20,15))
ax = data.plot(kind='hist', bins=50, density=True, alpha=0.5, color = [color['color'] for color in list(mpl.rcParams['axes.prop_cycle'])])
# Save plot limits
dataYLim = ax.get_ylim()
# Find best fit distribution
best_fit_name, best_fit_params = best_fit_distribution(data, 200, ax)
best_dist = getattr(st, best_fit_name)
# Update plots
ax.set_ylim(dataYLim)
ax.set_title(u'Biogas yield.\n Vanheede ')
ax.set_xlabel(u'Biogas yield (m3/day)')
ax.set_ylabel('Frequency')
# Make PDF with best params
pdf = make_pdf(best_dist, best_fit_params)
# Display
plt.figure(figsize=(20,15))
ax = pdf.plot(lw=2, label='PDF', legend=True)
data.plot(kind='hist', bins=50, density=True, alpha=0.5, label='Data', legend=True, ax=ax)
param_names = (best_dist.shapes + ', loc, scale').split(', ') if best_dist.shapes else ['loc', 'scale']
param_str = ', '.join(['{}={:0.2f}'.format(k,v) for k,v in zip(param_names, best_fit_params)])
dist_str = '{}({})'.format(best_fit_name, param_str)
ax.set_title(u'Biogas yield. with best fit distribution \n' + dist_str)
ax.set_xlabel(u'Biogas yield (m3/day)')
ax.set_ylabel('Frequency')
Text(0, 0.5, 'Frequency')
<Figure size 1440x1080 with 0 Axes>